#install.packages("wesanderson")
# Load
library(wesanderson)
## Warning: package 'wesanderson' was built under R version 4.1.3
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.1.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forecast)
## Warning: package 'forecast' was built under R version 4.1.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: ggplot2
## Loading required package: lattice
library(rpart)
## Warning: package 'rpart' was built under R version 4.1.2
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.2
##loading csv files
train <-read.csv("D:/Projects/DemandForecasting/train_GzS76OK/train.csv")
fulfilment <-read.csv("D:/Projects/DemandForecasting/train_GzS76OK/fulfilment_center_info.csv")
meal <-read.csv("D:/Projects/DemandForecasting/train_GzS76OK/meal_info.csv")
test <-read.csv("D:/Projects/DemandForecasting/test_QoiMO9B.csv")
###Merging the files
Train1<-merge(train,fulfilment,by='center_id')
trainfull <- merge(Train1,meal, by='meal_id')
trainfull
Test1<-merge(test,fulfilment,by='center_id')
testfull <- merge(Test1,meal, by='meal_id')
testfull
###Exploratory Data Analysis ##Summary Statistics
summary(trainfull)
## meal_id center_id id week
## Min. :1062 Min. : 10.00 Min. :1000000 Min. : 1.00
## 1st Qu.:1558 1st Qu.: 43.00 1st Qu.:1124999 1st Qu.: 39.00
## Median :1993 Median : 76.00 Median :1250184 Median : 76.00
## Mean :2024 Mean : 82.11 Mean :1250096 Mean : 74.77
## 3rd Qu.:2539 3rd Qu.:110.00 3rd Qu.:1375140 3rd Qu.:111.00
## Max. :2956 Max. :186.00 Max. :1499999 Max. :145.00
## checkout_price base_price emailer_for_promotion homepage_featured
## Min. : 2.97 Min. : 55.35 Min. :0.00000 Min. :0.0000
## 1st Qu.:228.95 1st Qu.:243.50 1st Qu.:0.00000 1st Qu.:0.0000
## Median :296.82 Median :310.46 Median :0.00000 Median :0.0000
## Mean :332.24 Mean :354.16 Mean :0.08115 Mean :0.1092
## 3rd Qu.:445.23 3rd Qu.:458.87 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :866.27 Max. :866.27 Max. :1.00000 Max. :1.0000
## num_orders city_code region_code center_type
## Min. : 13.0 Min. :456.0 Min. :23.00 Length:456548
## 1st Qu.: 54.0 1st Qu.:553.0 1st Qu.:34.00 Class :character
## Median : 136.0 Median :596.0 Median :56.00 Mode :character
## Mean : 261.9 Mean :601.6 Mean :56.61
## 3rd Qu.: 324.0 3rd Qu.:651.0 3rd Qu.:77.00
## Max. :24299.0 Max. :713.0 Max. :93.00
## op_area category cuisine
## Min. :0.900 Length:456548 Length:456548
## 1st Qu.:3.600 Class :character Class :character
## Median :4.000 Mode :character Mode :character
## Mean :4.084
## 3rd Qu.:4.500
## Max. :7.000
summary(testfull)
## meal_id center_id id week
## Min. :1062 Min. : 10.0 Min. :1000085 Min. :146.0
## 1st Qu.:1558 1st Qu.: 43.0 1st Qu.:1123969 1st Qu.:148.0
## Median :1993 Median : 76.0 Median :1247296 Median :150.0
## Mean :2032 Mean : 81.9 Mean :1248476 Mean :150.5
## 3rd Qu.:2569 3rd Qu.:110.0 3rd Qu.:1372971 3rd Qu.:153.0
## Max. :2956 Max. :186.0 Max. :1499996 Max. :155.0
## checkout_price base_price emailer_for_promotion homepage_featured
## Min. : 67.9 Min. : 89.24 Min. :0.00000 Min. :0.00000
## 1st Qu.: 214.4 1st Qu.: 243.50 1st Qu.:0.00000 1st Qu.:0.00000
## Median : 320.1 Median : 321.13 Median :0.00000 Median :0.00000
## Mean : 341.9 Mean : 356.49 Mean :0.06644 Mean :0.08136
## 3rd Qu.: 446.2 3rd Qu.: 455.93 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1113.6 Max. :1112.62 Max. :1.00000 Max. :1.00000
## city_code region_code center_type op_area
## Min. :456.0 Min. :23.00 Length:32573 Min. :0.900
## 1st Qu.:556.0 1st Qu.:34.00 Class :character 1st Qu.:3.600
## Median :596.0 Median :56.00 Mode :character Median :4.000
## Mean :601.5 Mean :56.71 Mean :4.088
## 3rd Qu.:651.0 3rd Qu.:77.00 3rd Qu.:4.500
## Max. :713.0 Max. :93.00 Max. :7.000
## category cuisine
## Length:32573 Length:32573
## Class :character Class :character
## Mode :character Mode :character
##
##
##
max(trainfull$num_orders)
## [1] 24299
trainf<-subset(trainfull, num_orders!=24299)
###Checking for Null values
missing_values<-summarise_all(trainf,funs(sum(is.na(.))/n()))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
missing_values<-list(missing_values,key="feature",value="MissingPoints")
###Visualizing Number of Orders
plot(trainf$checkout_price,trainf$num_orders, col= "blue", main="Number of Orders Processed Vs CheckoutPrice")
###Visualizing Number of Orders vsWeek numbers
plot(trainf$week,trainf$num_orders, col="red",main="No of Orders processed vs Week numbers")
###Visualizing Number of Orders vs Meal id
plot(trainf$meal_id,trainf$num_orders,col='dark Green', main="No of Orders procesed vs Meal id")
###Visualizing Number of Orders vs Center id
plot(trainf$center_id,trainf$num_orders,col='blue', main="No of Orders processed vs Center id")
###Visualizing Number of Orders vs emailer for promotion
g<-ggplot(data=trainf)
g<-g+geom_bar(stat="identity",aes(x=emailer_for_promotion,y=num_orders,fill=emailer_for_promotion))
g<-g+xlab("Emailer For promotion")+ylab("Number of orders")+theme(legend.position = "none")+scale_y_continuous(name="Number of orders", labels = scales::comma)
g
###Visualizing Number of Orders vs Homepage featured
g<-ggplot(data=trainf)
g<-g+geom_bar(stat="identity",aes(x=homepage_featured, y= num_orders,fill=homepage_featured))
g<-g+xlab("Home Page featured")+ylab("Number of orders")+theme(legend.position = "none")
g+scale_y_continuous(name="Number of orders", labels = scales::comma)
###Visualizing Number of Orders vs Category
g<-ggplot(data=trainf)
g<-g+geom_bar(stat="identity",aes(x=category, y= num_orders,fill=category))
g<-g+xlab("Category")+ylab("Number of orders")+theme(legend.position = "none")
g+scale_y_continuous(name="Number of orders", labels = scales::comma)
###Visualizing Number of Orders vs Cuisine
g<-ggplot(data=trainf)
g<-g+geom_bar(stat="identity",aes(x=cuisine, y= num_orders,fill=cuisine))
g<-g+xlab("Cuisine")+ylab("Number of orders")+theme(legend.position = "none")
g+scale_y_continuous(name="Number of orders", labels = scales::comma)
###Visualizing Number of Orders vs Center_type
g<-ggplot(data=trainf)
g<-g+geom_bar(stat="identity",aes(x=center_type, y= num_orders,fill=center_type))
g<-g+xlab("Center_type")+ylab("Number of orders")+theme(legend.position = "none")
g+scale_y_continuous(name="Number of units", labels = scales::comma)
###Correlation matrix
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.2
## corrplot 0.92 loaded
M<-cor(trainf[, unlist(lapply(trainf, is.numeric))],method="pearson")
M
## meal_id center_id id week
## meal_id 1.0000000000 0.009893983 4.869528e-04 0.0198162535
## center_id 0.0098939827 1.000000000 2.643985e-03 -0.0034529968
## id 0.0004869528 0.002643985 1.000000e+00 0.0022303549
## week 0.0198162535 -0.003452997 2.230355e-03 1.0000000000
## checkout_price 0.0107490579 0.001345637 1.944697e-03 0.0265774054
## base_price 0.0026054232 0.000603923 2.910267e-03 0.0286127377
## emailer_for_promotion 0.0133984674 0.013664481 1.990718e-03 -0.0008288659
## homepage_featured 0.0163515596 -0.005037491 3.102039e-03 -0.0082526328
## num_orders 0.0105749006 -0.053136539 5.177228e-04 -0.0170555268
## city_code -0.0031975495 0.061077510 -3.861266e-04 0.0004040175
## region_code -0.0016619278 -0.003426370 -6.480134e-04 0.0045993911
## op_area -0.0015472583 -0.111867232 -9.263352e-05 0.0015530210
## checkout_price base_price emailer_for_promotion
## meal_id 0.010749058 0.002605423 0.0133984674
## center_id 0.001345637 0.000603923 0.0136644812
## id 0.001944697 0.002910267 0.0019907183
## week 0.026577405 0.028612738 -0.0008288659
## checkout_price 1.000000000 0.953389824 0.0048260946
## base_price 0.953389824 1.000000000 0.1711766436
## emailer_for_promotion 0.004826095 0.171176644 1.0000000000
## homepage_featured -0.057177959 0.057158546 0.3905210374
## num_orders -0.283103314 -0.223173519 0.2778266152
## city_code -0.004805222 -0.002054225 -0.0052325861
## region_code -0.003648476 -0.001934504 -0.0074615272
## op_area 0.021571634 0.018031715 -0.0194693973
## homepage_featured num_orders city_code
## meal_id 0.016351560 0.0105749006 -0.0031975495
## center_id -0.005037491 -0.0531365394 0.0610775099
## id 0.003102039 0.0005177228 -0.0003861266
## week -0.008252633 -0.0170555268 0.0004040175
## checkout_price -0.057177959 -0.2831033139 -0.0048052224
## base_price 0.057158546 -0.2231735189 -0.0020542250
## emailer_for_promotion 0.390521037 0.2778266152 -0.0052325861
## homepage_featured 1.000000000 0.2953075422 0.0086416326
## num_orders 0.295307542 1.0000000000 0.0417882255
## city_code 0.008641633 0.0417882255 1.0000000000
## region_code 0.003605678 0.0298695703 0.0426857134
## op_area 0.041492872 0.1775709033 0.1314765884
## region_code op_area
## meal_id -0.0016619278 -1.547258e-03
## center_id -0.0034263704 -1.118672e-01
## id -0.0006480134 -9.263352e-05
## week 0.0045993911 1.553021e-03
## checkout_price -0.0036484756 2.157163e-02
## base_price -0.0019345044 1.803171e-02
## emailer_for_promotion -0.0074615272 -1.946940e-02
## homepage_featured 0.0036056776 4.149287e-02
## num_orders 0.0298695703 1.775709e-01
## city_code 0.0426857134 1.314766e-01
## region_code 1.0000000000 2.332746e-02
## op_area 0.0233274638 1.000000e+00
corrplot(M,method="circle")
##Building Random Forest ML Model
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.2
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
rf<-randomForest(num_orders~meal_id+checkout_price+base_price+emailer_for_promotion+homepage_featured+center_type+cuisine,data=trainf,ntree=50)
rf
##
## Call:
## randomForest(formula = num_orders ~ meal_id + checkout_price + base_price + emailer_for_promotion + homepage_featured + center_type + cuisine, data = trainf, ntree = 50)
## Type of random forest: regression
## Number of trees: 50
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 54996.51
## % Var explained: 64.63
summary(rf)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 456547 -none- numeric
## mse 50 -none- numeric
## rsq 50 -none- numeric
## oob.times 456547 -none- numeric
## importance 7 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 456547 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
###Splitting the test data into Public and Private
Private.index <- sample(c(1:dim(testfull)[1]), dim(testfull)[1]*0.7)
Public.index <- setdiff(c(1:dim(testfull)[1]), Private.index)
Privatet.df <- testfull[Private.index, ]
Public.df <- testfull[Public.index, ]
##Predicting on Train Data
library(dplyr)
trainf$predRF<-predict(rf,data=trainf)
head(select(trainf,c(predRF,num_orders)))
rmsle<- sqrt(1/length(trainf$num_orders)*sum((log(trainf$predRF +1)-log(trainf$num_orders +1))^2))
##Copying the Predicted values of test data to csv
predict_test <-predict(rf,newda=testfull)
Public_predict<-data.frame(testfull$id,num_orders=predict_test)
write.csv(Public_predict,"Public_predict_1.csv",row.names = F)